

# For this analysis, the raw data are first sorted by pre/post; the post rows are then sorted by DataManagement_YesNo.
# Using DM_personalPolcyText, the post low (== 0) rows are saved in the file LowData.txt and the post high (== 1) rows in the file HighData.txt.



# from: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know


library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")

# NOTE(review): this hard-coded absolute path ties the script to one machine.
# LowData.txt and HighData.txt below are read relative to this directory.
setwd("C:/Users/Kevin/Dropbox/iREDS/Data/Analysis/single_items/wordclouds")

# Transformation that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))

# Study-specific words removed in addition to the English stopwords.
# Defined once so that both groups are always filtered with the same list.
custom.stopwords <- c("data", "lab", "drive", "notebook", "hard", "keep")

# Apply the cleaning steps shared by both groups to a tm corpus.
#
# Args:
#   doc:         a corpus as returned by Corpus().
#   extra.words: character vector of additional words to remove
#                (defaults to custom.stopwords).
# Returns:
#   The corpus after all cleaning steps, applied in the order listed below.
cleanCorpus <- function(doc, extra.words = custom.stopwords) {
  # Turn "/", "@" and "|" into spaces so the words they join are separated.
  for (separator in c("/", "@", "\\|")) {
    doc <- tm_map(doc, toSpace, separator)
  }
  # Convert the text to lower case
  doc <- tm_map(doc, content_transformer(tolower))
  # Remove numbers
  doc <- tm_map(doc, removeNumbers)
  # Remove English common stopwords, then the study-specific ones.
  # NOTE(review): this runs before punctuation removal, presumably so that
  # stopwords containing apostrophes still match -- confirm before reordering.
  doc <- tm_map(doc, removeWords, stopwords("english"))
  doc <- tm_map(doc, removeWords, extra.words)
  # Remove punctuation
  doc <- tm_map(doc, removePunctuation)
  # Eliminate extra white spaces
  doc <- tm_map(doc, stripWhitespace)
  # Text stemming (disabled)
  # doc <- tm_map(doc, stemDocument)
  doc
}

# ---- Post low (== 0) group ----
# text.low <- readLines(file.choose())  # interactive alternative
text.low <- readLines("LowData.txt")
doc.low <- cleanCorpus(Corpus(VectorSource(text.low)))
# inspect(doc.low)

# Term-document matrix -> total count per term, most frequent first.
dtm.low <- TermDocumentMatrix(doc.low)
m.low <- as.matrix(dtm.low)
v.low <- sort(rowSums(m.low), decreasing = TRUE)
d.low <- data.frame(word = names(v.low), freq = v.low)

# ---- Post high (== 1) group ----
# text.high <- readLines(file.choose())  # interactive alternative
text.high <- readLines("HighData.txt")
doc.high <- cleanCorpus(Corpus(VectorSource(text.high)))
# inspect(doc.high)

# Term-document matrix -> total count per term, most frequent first.
dtm.high <- TermDocumentMatrix(doc.high)
m.high <- as.matrix(dtm.high)
v.high <- sort(rowSums(m.high), decreasing = TRUE)
d.high <- data.frame(word = names(v.high), freq = v.high)

# Show the ten most frequent words of each group (high first, then low).
head(d.high, 10)
head(d.low, 10)





# Seed the RNG once, before both clouds are drawn, so the plots can be
# reproduced. (The seed value is the author's office phone number.)
set.seed(8275591)

# Draw the low-group cloud first and the high-group cloud second, both with
# the same settings: at most 50 words, any word with frequency >= 1 eligible,
# words placed in decreasing frequency order, rot.per set to 0.35, and the
# 8-colour "Dark2" palette.
invisible(lapply(list(d.low, d.high), function(freq.table) {
  wordcloud(words = freq.table$word, freq = freq.table$freq, min.freq = 1,
            max.words = 50, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}))







